{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Analysis of multivariate data\n",
    "\n",
    "- Regression line\n",
    "- Correlation\n",
    "\n",
    "Author:  Thomas Haslwanter, Date:    Jun-2017"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "from numpy.linalg import lstsq\n",
    "from urllib.request import urlopen\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "## Regression Line\n",
    "\n",
    "Fit a line, using the powerful \"ordinary least square\" method of pandas.\n",
    "\n",
    "*Data from 24 type 1 diabetic patients, relating Fasting blood glucose (mmol/l) to mean circumferential shortening velocity (%/sec), derived form echocardiography.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Get the data\n",
    "url_base = 'https://raw.githubusercontent.com/thomas-haslwanter/statsintro_python/master/ipynb/Data/data_altman/'\n",
    "inFile = 'altman_11_6.txt'\n",
    "url = url_base + inFile\n",
    "data = np.genfromtxt(urlopen(url), delimiter=',')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### Solve equations \"by hand\" ..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1.098, 0.022]), array([0.986]), 2, array([54.079,  1.838]))\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXUAAAD4CAYAAAATpHZ6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADt0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjByYzIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy/EUOrgAAARbUlEQVR4nO3df5BdZX3H8c8nbNwtGgTNRht+JLYjkUIh6JXQopVAW351Ep1pmULqD4qTUdQi05bgQGUc+aOV/tAMoyGDNLaBOK2CWiwWoa1kphDY8DMYIU7BJYLuZjItBCZb1nz7x72rm7B37727Z+9zznPfr5md3bvnzD3fubn3k2ef85zvcUQIAJCHeakLAAAUh1AHgIwQ6gCQEUIdADJCqANARvpSHXjhwoWxdOnSVIcHgEravn37nogYbLY9WagvXbpUQ0NDqQ4PAJVk+0fTbWf6BQAyQqgDQEYIdQDICKEOABkh1AEgI4Q6ABRg5IX9uvDG+zTy4v6kdRDqAFCA9ffs0oPP7NX6u3clrSPZOnUAyMGya+7U2PiBnz/evG1Ym7cNq79vnp687ryu18NIHQBmYeuVK7Vq+WINzK/H6cD8eVq9fLG2rluZpB5CHQBmYdERA1rQ36ex8QPq75unsfEDWtDfp0ULBpLUw/QLAMzSnn1jWrNiiS4+7Tjd+sCwRhOeLHWq29nVarWg9wsAdMb29oioNdvO9AsAZIRQB4CMEOoAkBFCHQAyQqgDQEYIdQDICKEOABkh1AEgI4Q6AGSEUAeAjBDqAJARQh0AMkKoA0BGCHUAyAihDgAZIdQBICOEOgBkhFAHgIwQ6gCQEUIdADJCqANARgh1AMgIoQ4AGSHUASAjhDoAZKRlqNu+2faI7R1Ntr/e9r/YftT2E7YvKb5MAEA72hmpb5J07jTbPybp+xFxiqQzJf2N7dfMvjQAQKdahnpE3Ctp73S7SFpg25Je19h3vJjyAACdKGJO/QZJJ0h6TtLjki6PiANT7Wh7re0h20Ojo6MFHBoAMFkRoX6OpEckLZa0XNINto+YaseI2BgRtYioDQ4OFnBoAMBkRYT6JZJui7ofSnpa0tsKeF4AQIeKCPVhSWdLku03SVom6b8LeF4AQIf6Wu1ge4vqq1oW2t4t6VpJ8yUpIjZI+qykTbYfl2RJ6yJiz5xVDABoqmWoR8RFLbY/J+l3C6sIADBjXFEKYFojL+zXhTfep5EX96cuBW0g1AFMa/09u/TgM3u1/u5dqUtBG1pOvwDoTcuuuVNj47+45GTztmFt3jas/r55evK68xJWhukwUgcwpa1XrtSq5Ys1ML8eEwPz52n18sXaum5l4sowHUIdwJQWHTGgBf19Ghs/oP6+eRobP6AF/X1atGAgdWmYBtMvAJras29Ma1Ys0cWnHadbHxjWKCdLS88RkeTAtVothoaGkhwbAKrK9vaIqDXbzvQLAGSEUAeAjBDqAJARQh0AMkKoA0BGCHUAyAihjjlBEyggDUIdc4ImUEAaXFGKQtEECkiLkToKRRMoIC1CHYWiCRSQFtMvKBxNoIB0aOgFABVCQy8A6CGEOgBkhFAHgIwQ6gCQEUIdADJCqANARgh1AMgIoQ4AGSHUASAjhDoAZIRQB4CMEOoAkBFCHQAyQqgDQEZahrrtm22P2N4xzT5n2n7E9hO2v1dsiQCAdrUzUt8k6dxmG20fKemLklZFxImS/qCY0gAAnWoZ6hFxr6S90+xysaTbImK4sf9IQbUBADpUxJz68ZKOsv2ftrfb/kCzHW2vtT1ke2h0dLSAQwMAJisi1PskvUPSBZLOkfQXto+faseI2BgRtYioDQ4OFnBoAMBkRdx4erekPRHxkqSXbN8r6RRJTxXw3ACADhQxUv+mpHfb7rN9uKQVknYW8LwAgA61HKnb3iLpTEkLbe+WdK2k+ZIUERsiYqft70h6TNIBSTdFRNPljwCAudMy1CPiojb2uV7S9YVUBACYMa4oBYCMEOoAkBFCHQAyQqgDQEYIdQDICKEOABkh1AEgI4Q6AGSEUAeAjBDqiYy8sF8X3nifRl7cn7oUABkh1BNZf88uPfjMXq2/e1fqUgBkpIjWu+jAsmvu1Nj4gZ8/3rxtWJu3Dau/b56evO68hJUByAEj9S7beuVKrVq+WAPz6y/9wPx5Wr18sbauW5m4MgA5INS7bNERA1rQ36ex8QPq75unsfEDWtDfp0ULBlKXBiADTL8ksGffmNasWKKLTztOtz4wrFFOlgIoiCMiyYFrtVoMDQ0lOfZURl7Yr49veVg3XHwqo2YApWV7e0TUmm1n+qWB1SgActDz0y+sRgGQk54fqbMaBUBOej7UWY0CoJvm+mryng916RerUW6/7AytWbFEo/vGUpcEIFNzff6O1S8A0AWHnr+b0On5O1a/AEAJdOv8HaEOAF3QrfN3Pb+kEQC6pRtXkzOnDgAVwpw6APQQQh0AMkKoA0BGCHUAyAihDgAZIdQBICOEOgBkhFAHgIy0DHXbN9sesb2jxX7vtP0z279fXHkAgE60M1LfJOnc6XawfZikv5L0bwXUBACYoZahHhH3StrbYrdPSPq6pJEiigIAzMys59RtHy3pfZI2tLHvWttDtodGR0dne2gAwCGKOFH6eUnrIuJnrXaMiI0RUYuI2uDgYAGHBgBMVkTr3Zqkr9qWpIWSzrc9HhHfKOC5AQAdmHWoR8RbJn62vUnSHQQ6AKTRzpLGLZLuk7TM9m7bl9r+iO2PzH156FVzfcd1IFctR+oRcVG7TxYRH5pVNUDD5DuuX/e+X09dDlAZ3M4OpXLoHdc3bxvW5m3DHd9xHehVtAlAqXTrjutArgh1lEq37rgO5IrpF5RON+64DuTKEZHkwLVaLYaGhpIcG/kYeWG/Pr7lYd1w8amM5tETbG+PiFqz7Uy/oNImr5IBwPQLKopVMsDUGKmjklglA0yNUEclsUoGmBqhjsqaWCVz+2VnaM2KJRrdN5a6pBmhJQKKxOoXILFrbn9ctzwwrDWnHUdLBLTUavULJ0pRCJYWdo6TvZgLTL+gECwt7BwnezEXGKljVhhtzhwnezEXGKljVhhtzk4uJ3tRHozUMSuMNmfnxvf/4nzXde89KWElyAWhjlmjARdQHixpBHAQVjKVGw29AHSElUzVRqiXXFFXG3LVIlpZds2dWnrVt7V527Ai6iuZll71bS275s7UpaEDhHrJFTVqYvSFVljJlAdOlJZUUeu/WUeOdrGSKQ+M1EuqqFEToy90gnXz1cdIvaSKGjUx+kInWDdffYzUE2nnxGVRoyZGX0DvYJ16IrRbBTATtN4tGU5cAphLTL90GScuAcwlQr3LOHEJYC4x/ZIADbAAzBVOlAIdoNkVUqOhF1Ag2i2g7Jh+AdrAqiVUBSN1oA2sWkJVtAx12zfbHrG9o8n2NbYfa3z9l+1Tii8TSItVS6iKdkbqmySdO832pyW9JyJOlvRZSRsLqAsoHdotoAraWv1ie6mkOyJi2g4/to+StCMijm71nKx+AYDOdXv1y6WSmt4mxfZa20O2h0ZHRws+9MxwRyAUgfcRyqKwULe9UvVQX9dsn4jYGBG1iKgNDg4WdehZYYkaisD7CGVRyPSL7ZMl3S7pvIh4qp0Dp55+OXSJ2oSqLFHjIphyqPr7CNUz59Mvto+TdJuk97cb6GVQ9SVqjAzLoervI+Sn5cVHtrdIOlPSQtu7JV0rab4kRcQGSZ+W9EZJX7QtSePT/S9SFlVdosZFMOVS1fcR8tUy1CPiohbbPyzpw4VV1EVVbKy19cqVuu5fd+quJ36i/a8c0MD8eTrnxDfr6gtOSF1az6ri+wj56uk2AVW8HyMjw/Kp4vsI+erpUK8qRoYAmqH1LgBUCK13AaCHEOoAkBFCHQAyQqgDQEYIdQDICKEOABmpXKjT4rS7eL2BaqlcqNPIqrt4vYFqqczFR7Q47S5eb6Ccsrn4iBan3cXrDVRTZUKdRlbdxesNVFOlGnrRyKq7eL2B6qnMnDrQCW73h1xlM6cOdIJVO+hVlZp+AVrhdn/odYzUkRVW7aDXEerICqt20OuYfkF2WLWDXsbqFwCoEFa/AEAPIdQBICOEOoA5Rfvm7iLUAcwpLgTrLla/AJgTXAiWBiN1AHOCC8HS6JlQZ14P6C4uBEujZ0KdeT2g+yYuBLv9sjO0ZsUSje4bS11S9rK/+IjbsgHISc9ffMS8HoBekn2oM68HoJf0xJJGGjwB6BUt59Rt3yzp9ySNRMRJU2y3pC9IOl/Sy5I+FBEPtTowDb0AoHNFzKlvknTuNNvPk/TWxtdaSV/qpEAAQHFahnpE3Ctp7zS7rJb0D1F3v6Qjbf9yUQUCANpXxInSoyU9O+nx7sbvXsX2WttDtodGR0cLODQAYLIiQt1T/G7KifqI2BgRtYioDQ4OFnBoAMBkRYT6bknHTnp8jKTnCnheAECHigj1b0n6gOtOl/S/EfF8Ac8LAOhQO0sat0g6U9JCST+VdK2k+ZIUERsaSxpvUH2FzMuSLomIlmsVbY9K+lGbdS6UtKfNfVOgvtmhvpkrc20S9c3WVPUtiYim89fJer90wvbQdOsyU6O+2aG+mStzbRL1zdZM6su+TQAA9BJCHQAyUpVQ35i6gBaob3aob+bKXJtEfbPVcX2VmFMHALSnKiN1AEAbCHUAyEglQt32YbYftn1H6loOZftI21+z/QPbO23/RuqaJti+wvYTtnfY3mI7+Z1BbN9se8T2jkm/e4Pt79re1fh+VIlqu77xb/uY7dttH5mitmb1Tdr2Z7bD9sIUtTVqmLI+25+w/WTjvfi5MtVne7nt+20/0uhLdVqi2o61/R+NDHnC9uWN33f82ahEqEu6XNLO1EU08QVJ34mIt0k6RSWp0/bRkv5EUq3RB/8wSX+YtipJU7dyvkrSPRHxVkn3NB6nsEmvru27kk6KiJMlPSXpU90uapJNmqINtu1jJf2OpOFuF3SITTqkPtsrVe/kenJEnCjprxPUNWGTXv36fU7SZyJiuaRPNx6nMC7pTyPiBEmnS/qY7V/TDD4bpQ9128dIukDSTalrOZTtIyT9lqQvS1JE/F9E/E/aqg7SJ+mXbPdJOlwl6MnTpJXzaklfafz8FUnv7WpRDVPVFhF3RcR44+H9qvc2SmKaNth/J+lKNWmk1y1N6vuopL+MiLHGPiNdL6yhSX0h6YjGz69Xos9IRDw/cXOhiHhR9cHh0ZrBZ6P0oS7p86q/YQ+kLmQKvyJpVNLfN6aHbrL92tRFSVJE/Fj1UdGwpOdV78lzV9qqmnrTRL+gxvdFietp5o8l3Zm6iMlsr5L044h4NHUtTRwv6d22t9n+nu13pi7oEJ+UdL3tZ1X/vKT8S0ySZHuppFMlbdMMPhulDnXbE7fR2566lib6JL1d0pci4lRJLynd1MFBGnNvqyW9RdJiSa+1/Udpq6ou21er/ifyLalrmWD7cElXqz5tUFZ9ko5SfUrhzyX9U6NfVFl8VNIVEXGspCvU+Ks7Fduvk/R1SZ+MiBdm8hylDnVJZ0haZfsZSV+VdJbtzWlLOshuSbsjYlvj8ddUD/ky+G1JT0fEaES8Iuk2Sb+ZuKZmfjpxt6zG92R/ok/F9gdVv0/vmijXhR2/qvp/2o82PiPHSHrI9puTVnWw3ZJua9wZ7QHV/+JOdjJ3Ch9U/bMhSf8sKcmJUkmyPV/1QL8lIiZq6vizUepQj4hPRcQxEbFU9ZN8/x4RpRltRsRPJD1re1njV2dL+n7CkiYblnS67cMbI6OzVZKTuFP4luofLjW+fzNhLQexfa6kdZJWRcTLqeuZLCIej4hFEbG08RnZLentjfdlWXxD0lmSZPt4Sa9RuboiPifpPY2fz5K0K0URjc/olyXtjIi/nbSp889GRFTiS/X2v3ekrmOKupZLGpL0mOpv4KNS1zSpts9I+oGkHZL+UVJ/CWraovoc/yuqh9Clkt6o+pn9XY3vbyhRbT9U/XaNjzS+NpTptTtk+zOSFpapPtVDfHPjPfiQpLNKVt+7JG2X9Kjqc9jvSFTbu1Q/afvYpPfa+TP5bNAmAAAyUurpFwBAZwh1AMgIoQ4AGSHUASAjhDoAZIRQB4CMEOoAkJH/Bz6ocqNXd2GzAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# First I have to delete rows containing \"nan\"\n",
    "a,b = np.where(np.isnan(data))\n",
    "data = np.delete(data, a, axis=0)\n",
    "\n",
    "x,y = data[:,0], data[:,1]\n",
    "plt.plot(x,y,'*')\n",
    "\n",
    "# Create the design matrix\n",
    "Xmat = sm.add_constant(x)\n",
    "\n",
    "# Calculate the parameters\n",
    "params = lstsq(Xmat, y, rcond=None)\n",
    "np.set_printoptions(precision=3)\n",
    "print(params)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "source": [
    "### ... then solve them with *pandas* and *statsmodels*\n",
    "\n",
    "pandas handles \"nan\" gracefully, and also provides more information about the fit. So let's use pandas, and compare the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                 Results: Ordinary least squares\n",
      "=================================================================\n",
      "Model:              OLS              Adj. R-squared:     0.134   \n",
      "Dependent Variable: Vcf              AIC:                -3.1672 \n",
      "Date:               2020-04-14 18:22 BIC:                -0.8962 \n",
      "No. Observations:   23               Log-Likelihood:     3.5836  \n",
      "Df Model:           1                F-statistic:        4.414   \n",
      "Df Residuals:       21               Prob (F-statistic): 0.0479  \n",
      "R-squared:          0.174            Scale:              0.046957\n",
      "-------------------------------------------------------------------\n",
      "              Coef.    Std.Err.     t      P>|t|    [0.025   0.975]\n",
      "-------------------------------------------------------------------\n",
      "Intercept     1.0978     0.1175   9.3446   0.0000   0.8535   1.3421\n",
      "glucose       0.0220     0.0105   2.1010   0.0479   0.0002   0.0437\n",
      "-----------------------------------------------------------------\n",
      "Omnibus:              1.717        Durbin-Watson:           1.802\n",
      "Prob(Omnibus):        0.424        Jarque-Bera (JB):        1.270\n",
      "Skew:                 0.562        Prob(JB):                0.530\n",
      "Kurtosis:             2.756        Condition No.:           29   \n",
      "=================================================================\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import statsmodels.formula.api as smf\n",
    "\n",
    "# Convert them into a pandas DataFrame\n",
    "df = pd.DataFrame(data, columns=['glucose', 'Vcf'])\n",
    "\n",
    "model_fit = smf.ols('Vcf~glucose', df).fit()\n",
    "\n",
    "print(model_fit.summary2())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "## Correlation\n",
    "\n",
    "Pearson correlation, and two types of rank correlation (Spearman, Kendall)\n",
    "\n",
    "*Comparing age and percentage of body-fat (measured by dual-photon absorptiometry) for 18 normal adults.*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [],
   "source": [
    "# Get the data\n",
    "inFile = 'altman_11_1.txt'\n",
    "url = url_base + inFile\n",
    "data = np.genfromtxt(urlopen(url), delimiter=',')\n",
    "\n",
    "x = data[:,0]\n",
    "y = data[:,1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "slideshow": {
     "slide_type": "subslide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'pearson': 0.7920862321784911, 'spearman': 0.7538795855376156, 'kendall': 0.5762094850891228}\n"
     ]
    }
   ],
   "source": [
    "# Calculate correlations\n",
    "corr = {}\n",
    "corr['pearson'], _ = stats.pearsonr(x,y)\n",
    "corr['spearman'], _ = stats.spearmanr(x,y)\n",
    "corr['kendall'], _ = stats.kendalltau(x,y)\n",
    "\n",
    "print(corr)    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Spearman's rho = 0.754, and Pearson's r (rankordered) = 0.754\n"
     ]
    }
   ],
   "source": [
    "# Show that Spearman's rho is just the Pearson's R of the rank-ordered data\n",
    "r_rankordered = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]\n",
    "print(\"Spearman's rho = {0:5.3f}, and Pearson's r (rankordered) = {1:5.3f}\".format(corr['spearman'], r_rankordered))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Slideshow",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
